Aside: Baseline of Zero?

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(palmerpenguins)
## Scatterplot of bill depth against bill length with both axes forced to
## start at zero, to see what a zero baseline does to the point cloud.
ggplot(penguins, aes(bill_length_mm, bill_depth_mm)) +
  geom_point() +
  lims(x = c(0, 60), y = c(0, 25))
## Warning: Removed 2 rows containing missing values (geom_point).

Section 5.1: Billboard

## install.packages("billboard")
library(billboard)
head(wiki_hot_100s)
##   no                     title              artist year
## 1  1 Theme from A Summer Place         Percy Faith 1960
## 2  2          He'll Have to Go          Jim Reeves 1960
## 3  3             Cathy's Clown The Everly Brothers 1960
## 4  4              Running Bear      Johnny Preston 1960
## 5  5                Teen Angel        Mark Dinning 1960
## 6  6                 I'm Sorry          Brenda Lee 1960
tail(wiki_hot_100s)
##       no                   title                             artist year
## 5696  95 Adventure of a Lifetime                           Coldplay 2016
## 5697  96         Humble and Kind                         Tim McGraw 2016
## 5698  97                  Wicked                             Future 2016
## 5699  98           Tiimmy Turner                          Desiigner 2016
## 5700  99           See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100                 Perfect                      One Direction 2016
## latest year in the data; `year` is stored as character, so max() returns a string
max(wiki_hot_100s$year)
## [1] "2016"
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tidyverse)

## Count year-end Hot 100 entries per artist for 2000 through 2009 and keep
## the ten artists with the most entries, ordered for plotting.
top10 <- wiki_hot_100s %>%
  ## `year` is stored as character in wiki_hot_100s, so convert it to a
  ## number before comparing instead of relying on implicit string comparison
  filter(between(as.numeric(year), 2000, 2009)) %>%
  count(artist, name = "nsongs") %>%
  arrange(desc(nsongs)) %>%
  slice_head(n = 10) %>%
  ## reorder the artist factor by song count so the bars plot in sorted order
  mutate(artist = fct_reorder(artist, nsongs))

## horizontal bar plot of the number of songs for each of the ten artists
ggplot(data = top10, aes(x = artist, y = nsongs)) +
  geom_col() +
  coord_flip()

5.1.1 Exercises
Exercise 2. There is a minor flaw in the way that we counted up the number of hits for each artist. Examine the 2nd to last row of the original data set with tail() to look at this potential flaw. What do you find?

tail(wiki_hot_100s)
##       no                   title                             artist year
## 5696  95 Adventure of a Lifetime                           Coldplay 2016
## 5697  96         Humble and Kind                         Tim McGraw 2016
## 5698  97                  Wicked                             Future 2016
## 5699  98           Tiimmy Turner                          Desiigner 2016
## 5700  99           See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100                 Perfect                      One Direction 2016
### The 2nd to last row lists two artists on a single song ("Wiz Khalifa featuring Charlie Puth"). Because we counted hits by the full artist string, this song is credited to the combined name rather than to Wiz Khalifa or to Charlie Puth individually, so each of those artists ends up with one fewer hit than they should have.



Exercise 4. Change the plot from Exercise 1 to be a Lollipop chart using this website as a reference. Why might the lollipop chart be better than a bar plot?

## Lollipop chart: a thin stem from zero up to each artist's song count,
## with a point marking the count itself.
ggplot(top10, aes(artist, nsongs)) +
  geom_point() +
  geom_segment(aes(xend = artist, y = 0, yend = nsongs)) +
  coord_flip()


A lollipop chart might be better than a bar plot because each artist's total number of songs is marked by a single point, which may make the values easier to read and to compare between artists.

Exercise 5. Use this website to customize the end points of your lollipop chart. If you have time, you can explore the other customization options. Make it look fancy!

## Lollipop chart with customized end points: red, semi-transparent filled
## circles (shape 21) with a thick outline.
ggplot(top10, aes(artist, nsongs)) +
  geom_point(
    shape = 21, size = 2, stroke = 2,
    colour = "red", fill = alpha("red", 0.3), alpha = 0.7
  ) +
  geom_segment(aes(xend = artist, y = 0, yend = nsongs)) +
  coord_flip()

## Scrape one year's Billboard Year-End Hot 100 table from Wikipedia.
year <- 2017

## paste0 pastes together the base URL and the year into a single string,
## stored as `webpage`: this will be useful in a moment
webpage <- paste0("https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_", year)

## download the page, stop with an error if the request failed, and convert
## the html code into something R can read.
## NOTE(review): ssl_verifypeer = FALSE disables TLS certificate verification;
## it is left as-is here, but confirm that it is still needed.
content <- webpage %>%
  httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
  httr::stop_for_status() %>%
  read_html()

## grabs the tables, keeps the first one, and records which year it came from
## (using the `year` variable rather than repeating the literal 2017)
tab <- content %>% html_nodes("table")
df <- tab[[1]] %>%
  html_table() %>%
  mutate(year = !!year)
df
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
## Scrape the Billboard Year-End Hot 100 table for one year from Wikipedia.
##
## year: a single year (number or string); it is pasted onto the end of the
##   Wikipedia URL and also stored in the `year` column of the result.
## Returns the first <table> on the page as a data frame with an added
##   `year` column.
## Stops with an error if `year` is not a single non-missing value, if the
##   HTTP request fails, or if the page contains no table.
get_wiki_100 <- function(year) {
  stopifnot(length(year) == 1, !is.na(year))

  webpage <- paste0(
    "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_",
    year
  )

  ## NOTE(review): ssl_verifypeer = FALSE disables TLS certificate
  ## verification; it is left as-is here, but confirm it is still needed.
  content <- webpage %>%
    httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
    httr::stop_for_status() %>% ## fail loudly on 4xx/5xx responses
    read_html()

  tab <- content %>% html_nodes("table")
  if (length(tab) == 0) {
    stop("No tables found at ", webpage, call. = FALSE)
  }

  ## `!!year` injects the function argument, so the value can never be
  ## confused with a column of the scraped table
  tab[[1]] %>%
    html_table() %>%
    mutate(year = !!year)
}
get_wiki_100(year = 2017)
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
library(purrr)
year_list <- list(2017, 2018, 2019, 2020, 2021)
year_list
## [[1]]
## [1] 2017
## 
## [[2]]
## [1] 2018
## 
## [[3]]
## [1] 2019
## 
## [[4]]
## [1] 2020
## 
## [[5]]
## [1] 2021
## call get_wiki_100() once for each element of year_list; map() collects
## the results in a list
df_all <- map(year_list, get_wiki_100)
df_all ## a list of data frames, one for each year
## [[1]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
## 
## [[2]]
## # A tibble: 100 × 4
##      No. Title                `Artist(s)`                                year
##    <int> <chr>                <chr>                                     <dbl>
##  1     1 "\"God's Plan\""     Drake                                      2018
##  2     2 "\"Perfect\""        Ed Sheeran                                 2018
##  3     3 "\"Meant to Be\""    Bebe Rexha featuring Florida Georgia Line  2018
##  4     4 "\"Havana\""         Camila Cabello featuring Young Thug        2018
##  5     5 "\"Rockstar\""       Post Malone featuring 21 Savage            2018
##  6     6 "\"Psycho\""         Post Malone featuring Ty Dolla Sign        2018
##  7     7 "\"I Like It\""      Cardi B, Bad Bunny and J Balvin            2018
##  8     8 "\"The Middle\""     Zedd, Maren Morris and Grey                2018
##  9     9 "\"In My Feelings\"" Drake                                      2018
## 10    10 "\"Girls Like You\"" Maroon 5 featuring Cardi B                 2018
## # … with 90 more rows
## 
## [[3]]
## # A tibble: 100 × 4
##      No. Title               `Artist(s)`                          year
##    <int> <chr>               <chr>                               <dbl>
##  1     1 "\"Old Town Road\"" Lil Nas X featuring Billy Ray Cyrus  2019
##  2     2 "\"Sunflower\""     Post Malone and Swae Lee             2019
##  3     3 "\"Without Me\""    Halsey                               2019
##  4     4 "\"Bad Guy\""       Billie Eilish                        2019
##  5     5 "\"Wow\""           Post Malone                          2019
##  6     6 "\"Happier\""       Marshmello and Bastille              2019
##  7     7 "\"7 Rings\""       Ariana Grande                        2019
##  8     8 "\"Talk\""          Khalid                               2019
##  9     9 "\"Sicko Mode\""    Travis Scott                         2019
## 10    10 "\"Sucker\""        Jonas Brothers                       2019
## # … with 90 more rows
## 
## [[4]]
## # A tibble: 100 × 4
##      No. Title                   `Artist(s)`                   year
##    <int> <chr>                   <chr>                        <dbl>
##  1     1 "\"Blinding Lights\""   The Weeknd                    2020
##  2     2 "\"Circles\""           Post Malone                   2020
##  3     3 "\"The Box\""           Roddy Ricch                   2020
##  4     4 "\"Don't Start Now\""   Dua Lipa                      2020
##  5     5 "\"Rockstar\""          DaBaby featuring Roddy Ricch  2020
##  6     6 "\"Adore You\""         Harry Styles                  2020
##  7     7 "\"Life Is Good\""      Future featuring Drake        2020
##  8     8 "\"Memories\""          Maroon 5                      2020
##  9     9 "\"The Bones\""         Maren Morris                  2020
## 10    10 "\"Someone You Loved\"" Lewis Capaldi                 2020
## # … with 90 more rows
## 
## [[5]]
## # A tibble: 100 × 4
##      No. Title                                `Artist(s)`                   year
##    <int> <chr>                                <chr>                        <dbl>
##  1     1 "\"Levitating\""                     Dua Lipa                      2021
##  2     2 "\"Save Your Tears\""                The Weeknd and Ariana Grande  2021
##  3     3 "\"Blinding Lights\""                The Weeknd                    2021
##  4     4 "\"Mood\""                           24kGoldn featuring Iann Dior  2021
##  5     5 "\"Good 4 U\""                       Olivia Rodrigo                2021
##  6     6 "\"Kiss Me More\""                   Doja Cat featuring SZA        2021
##  7     7 "\"Leave the Door Open\""            Silk Sonic (Bruno Mars and …  2021
##  8     8 "\"Drivers License\""                Olivia Rodrigo                2021
##  9     9 "\"Montero (Call Me by Your Name)\"" Lil Nas X                     2021
## 10    10 "\"Peaches\""                        Justin Bieber featuring Dan…  2021
## # … with 90 more rows
## Stack the per-year data frames into one, rename the columns to match the
## billboard package, and strip the literal double-quote characters that
## surround each scraped title.
df_2017_present <- df_all %>%
  bind_rows() %>%
  rename(
    no = No.,
    title = Title,
    artist = `Artist(s)`
  ) %>%
  mutate(title = str_remove_all(title, pattern = "\""))

## Convert the billboard package data to a tibble and change the variable
## types to match the scraped data (numeric year, integer rank).
## NOTE(review): running this emits an "NAs introduced by coercion" warning,
## presumably from non-numeric entries in `no`; confirm which rows become NA.
wiki_tibble <- wiki_hot_100s %>%
  as_tibble() %>%
  mutate(
    year = as.numeric(year),
    no = as.integer(no)
  )
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## stack the billboard package data on top of the scraped data into a single data frame
hot100_df <- bind_rows(wiki_tibble, df_2017_present)

Exercise 6. Use the hot100_df to make either a bar plot or a lollipop chart of the most popular artists of the 2010s (2010 through 2019). It may be helpful to make this plot without looking back at the code for the 2000s plot until you get stuck.

## Fifteen artists with the most year-end Hot 100 entries from 2010 through
## 2019; `nsongs_ordered` is the artist factor reordered by song count.
top15_df <- hot100_df %>%
  filter(year >= 2010, year <= 2019) %>%
  count(artist, name = "nsongs") %>%
  arrange(desc(nsongs)) %>%
  slice_head(n = 15) %>%
  mutate(nsongs_ordered = fct_reorder(artist, nsongs))

## lollipop chart of those counts, one stem per artist
ggplot(top15_df, aes(nsongs_ordered, nsongs)) +
  geom_point() +
  geom_segment(aes(xend = nsongs_ordered, y = 0, yend = nsongs)) +
  coord_flip() +
  labs(x = "artist", y = "number of songs")

Exercise 7. Much of the code to scrape the data, using purrr to iterate over the scrape, and then combining the list of data frames to a single data frame may be new. It is not expected that you are able to write this code on your own, but you should have an overall understanding of what the code is doing. Write 2-3 sentences that summarizes the overall purpose of the rvest and purrr code.
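The rvest code downloads the HTML of a Wikipedia page, converts it into something that R can read, and then extracts the first table on the page as a data frame. The purrr code uses map() to apply that scraping function to each year in a list, which produces a list of data frames, one for each year. Finally, bind_rows() combines that list into a single data frame that is easier to work with in R.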

5.2 More tidyverse Review: Happy Planet Index

library(tidyverse)
## read the Happy Planet Index data; the path is relative to the working directory
hpi_df <- read_csv("data/hpi-tidy.csv")
## Rows: 151 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, GovernanceRank, Region
## dbl (8): HPIRank, LifeExpectancy, Wellbeing, HappyLifeYears, Footprint, Happ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hpi_df
## # A tibble: 151 × 11
##    HPIRank Country     LifeExpectancy Wellbeing HappyLifeYears Footprint
##      <dbl> <chr>                <dbl>     <dbl>          <dbl>     <dbl>
##  1     109 Afghanistan           48.7      4.76           29.0     0.540
##  2      18 Albania               76.9      5.27           48.8     1.81 
##  3      26 Algeria               73.1      5.24           46.2     1.65 
##  4     127 Angola                51.1      4.21           28.2     0.891
##  5      17 Argentina             75.9      6.44           55.0     2.71 
##  6      53 Armenia               74.2      4.37           41.9     1.73 
##  7      76 Australia             81.9      7.41           65.5     6.68 
##  8      48 Austria               80.9      7.35           64.3     5.29 
##  9      80 Azerbaijan            70.7      4.22           39.1     1.97 
## 10     146 Bahrain               75.1      4.55           43.5     6.65 
## # … with 141 more rows, and 5 more variables: HappyPlanetIndex <dbl>,
## #   Population <dbl>, GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>

5.2.1 Making a Scatterplot and Labeling Points

## Wellbeing against ecological footprint, one point per country
ggplot(hpi_df, aes(Footprint, Wellbeing)) +
  geom_point()

## keep only the row for the United States so it can be labelled later
hpi_us <- filter(hpi_df, Country == "United States of America")
hpi_us
## # A tibble: 1 × 11
##   HPIRank Country              LifeExpectancy Wellbeing HappyLifeYears Footprint
##     <dbl> <chr>                         <dbl>     <dbl>          <dbl>     <dbl>
## 1     105 United States of Am…           78.5      7.16           61.3      7.19
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## #   GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
## Label only the United States: geom_label() is given data = hpi_us, so it
## only uses the observation in hpi_us rather than every country.
ggplot(hpi_df, aes(Footprint, Wellbeing)) +
  geom_point() +
  geom_label(aes(label = Country), data = hpi_us)
library(ggrepel)
ggplot(hpi_df, aes(Footprint, Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_us) +
  ## second point layer: a larger open circle (shape = 1, size = 3) drawn
  ## around the United States point on the scatterplot
  geom_point(data = hpi_us, shape = 1, size = 3)

Exercise 1. Change the code to label 3 countries of interest. Recall that you will need to use the | operator in the dplyr::filter() function.

## the three countries of interest, selected with the | (or) operator
label3 <- hpi_df %>%
  filter(
    Country == "New Zealand" |
      Country == "Australia" |
      Country == "Kenya"
  )

## label and circle just those three countries on the full scatterplot
ggplot(hpi_df, aes(Footprint, Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = label3) +
  geom_point(data = label3, shape = 1, size = 3)

5.2.2 plotly to Label Points Interactively

## install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## interactive version of the scatterplot with plotly's default tooltip
plot1 <- ggplot(hpi_df, aes(Footprint, Wellbeing)) +
  geom_point()
ggplotly(plot1)

## add Country as a label aesthetic and show only that in the tooltip
plot1 <- plot1 + aes(label = Country)
ggplotly(plot1, tooltip = "label")
ggplot(hpi_df, aes(Footprint, Wellbeing)) +
  geom_point() +
  geom_label_repel(aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3) +
  labs(
    ## axis labels
    x = "Ecological Footprint",
    y = "Wellbeing",
    ## title, plus a subtitle (smaller text size than the title)
    title = "Countries with a Higher Ecological Footprint Tend to Have Citizens with Higher Wellbeing",
    subtitle = "Wellbeing is on a 1-10 scale",
    ## caption at the bottom of the figure
    caption = "Data Source: http://happyplanetindex.org/countries"
  )

## Region shown with the ColorBrewer "Accent" palette
ggplot(hpi_df, aes(Footprint, HappyLifeYears, color = Region)) +
  geom_point() +
  scale_color_brewer(palette = "Accent")

## Region shown with the discrete viridis "plasma" palette
ggplot(hpi_df, aes(Footprint, HappyLifeYears, color = Region)) +
  geom_point() +
  scale_color_viridis_d(option = "plasma")

## Region shown with one panel per region instead of colour
ggplot(hpi_df, aes(Footprint, HappyLifeYears)) +
  geom_point() +
  facet_wrap(vars(Region))

library(palmerpenguins)
## species mapped to colour: colour is good enough here
ggplot(penguins, aes(bill_length_mm, bill_depth_mm)) +
  geom_point(aes(color = species))
## Warning: Removed 2 rows containing missing values (geom_point).

## one panel per species; faceting is probably unnecessary: colour is better
ggplot(penguins, aes(bill_length_mm, bill_depth_mm)) +
  geom_point() +
  facet_wrap(vars(species))
## Warning: Removed 2 rows containing missing values (geom_point).

## Simulated data: 500 standard-normal (x, y) pairs split into ten groups,
## "A" through "J", of 50 consecutive rows each.
colour_bad <- tibble(
  x = rnorm(500),
  y = rnorm(500),
  groupvar = rep(LETTERS[1:10], each = 50)
)

## ten groups in one panel: can't distinguish anything really, colour is bad
ggplot(colour_bad, aes(x, y, color = groupvar)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## the same data with one panel per group: faceting is better here
ggplot(colour_bad, aes(x, y)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(vars(groupvar))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'